Turn columns into the right format
# Cast the numeric columns (read in as strings) to double, drop rows with
# missing values, and give a few columns shorter names.
# A single mutate() call replaces nine chained ones — same result, and the
# Spark SQL translation stays a single projection.
d1 <- d1 %>%
  mutate(
    host_id = as.numeric(host_id),
    latitude = as.numeric(latitude),
    longitude = as.numeric(longitude),
    price = as.numeric(price),
    minimum_nights = as.numeric(minimum_nights),
    number_of_reviews = as.numeric(number_of_reviews),
    reviews_per_month = as.numeric(reviews_per_month),
    calculated_host_listings_count = as.numeric(calculated_host_listings_count),
    availability_365 = as.numeric(availability_365)
  ) %>%
  na.omit() %>%
  rename(
    desc = name,
    ID = host_id,
    listing_host = calculated_host_listings_count,
    availability = availability_365
  )
## * Dropped 7 rows with 'na.omit' (46941 => 46934)
length = sdf_nrow(d1)
We’ll start by plotting distribution of all numerical variables.
#histogram
# Collect a single column from the Spark data frame and draw its histogram.
#
# col:  name of the column to plot (coerced to numeric locally)
# data: Spark data frame to read from (defaults to the global d1)
# Returns a ggplot object.
plot_histogram <- function(col, data = d1) {
  plot_data <- data %>% sdf_read_column(col) %>% as.numeric() %>% as.data.frame()
  colnames(plot_data) <- col
  plot <- ggplot(plot_data, aes_string(x = col)) +
    geom_histogram(bins = 30, fill = "#1F77B4") +
    # BUG FIX: the original was missing the '+' after theme_few(), so the
    # labs() and theme() calls below formed a separate, discarded expression
    # and the histograms were produced without titles or axis tweaks.
    theme_few() +
    labs(title = col,
         x = "",
         y = "") +
    theme(plot.title = element_text(hjust = 0.5, size = 9),
          axis.ticks.y = element_blank(),
          axis.text.y = element_blank())
  return(plot)
}
# Collect a single column from the Spark data frame and draw a boxplot of it.
#
# col:  name of the column to plot (coerced to numeric locally)
# data: Spark data frame to read from (defaults to the global d1)
# Returns a ggplot object.
plot_box <- function(col, data = d1) {
  values <- data %>%
    sdf_read_column(col) %>%
    as.numeric()
  local_df <- as.data.frame(values)
  colnames(local_df) <- col
  ggplot(local_df, aes_string(y = col)) +
    geom_boxplot() +
    theme_few() +
    labs(title = col, x = "", y = "") +
    theme(plot.title = element_text(hjust = 0.5, size = 9))
}
Loop over numeric columns and plot histograms
# Numeric columns selected by position in the cleaned data frame.
numeric_columns <- colnames(d1)[c(5, 6, 8, 9, 10, 12:14)]
# One histogram per numeric column, keyed by column name.
# lapply/setNames replaces the 1:length() index loop (which misbehaves on
# an empty vector) and avoids growing the list one element at a time.
histograms <- setNames(lapply(numeric_columns, plot_histogram), numeric_columns)
# arrange histograms in a 2-column grid
do.call("grid.arrange", c(histograms, ncol = 2))
# save the grid as png
hist_grid <- do.call("arrangeGrob", c(histograms, ncol = 2))
ggsave(paste0(VISUALIZATIONS, "/histogram_grid.png"), hist_grid, width = 7, height = 7)
# One boxplot per numeric column, keyed by column name (same pattern as the
# histograms above; replaces the 1:length() index loop).
boxplots <- setNames(lapply(numeric_columns, plot_box), numeric_columns)
# arrange boxplots in a 2-column grid (original comment said "histograms")
do.call("grid.arrange", c(boxplots, ncol = 2))
# save the grid as png
box_grid <- do.call("arrangeGrob", c(boxplots, ncol = 2))
ggsave(paste0(VISUALIZATIONS, "/boxplot_grid.png"), box_grid)
## Saving 7 x 5 in image
We’ll start by looking for nonsense values (e.g. availability more than 365) and explore whether it needs transformations, excluding outliers etc.
# Spark-side summary statistics for availability (days per year bookable)
d1 %>%
  sdf_describe("availability")
## # Source: spark<?> [?? x 2]
## summary availability
## <chr> <chr>
## 1 count 46934
## 2 mean 110.44541270720586
## 3 stddev 130.9620570919105
## 4 min 0.0
## 5 max 365.0
# look at its histogram in detail
histograms$availability
boxplots$availability
# there is at least one listing that is never available
# (typo fix: "accomodations" -> "accommodations" in the printed message)
cat("How many accommodations are never available?",
    d1 %>% filter(availability == 0) %>% sdf_nrow())
## How many accommodations are never available? 17209
# on the other hand, how many are always available
cat("\nHow many accommodations are always available?",
    d1 %>% filter(availability == 365) %>% sdf_nrow())
##
## How many accommodations are always available? 1199
It seems most listings are available only for very narrow time windows. However, there is quite high variation, so the mean and standard deviation are shifted away from the mode. Although a log-transform might help to achieve a normal distribution, we’d lose data because there are many 0 values (log(0) = -infinity).
# Spark-side summary statistics for minimum_nights
d1 %>%
  sdf_describe("minimum_nights")
## # Source: spark<?> [?? x 2]
## summary minimum_nights
## <chr> <chr>
## 1 count 46934
## 2 mean 7.115609153279073
## 3 stddev 20.634444444815273
## 4 min 1.0
## 5 max 1250.0
# look at histogram and boxplot
histograms$minimum_nights
boxplots$minimum_nights
# typo fix in messages: "accomodations" -> "accommodations"
cat("How many accommodations have minimum nights a year or more?",
    d1 %>% filter(minimum_nights >= 365) %>% sdf_nrow())
## How many accommodations have minimum nights a year or more? 41
# na.rm = TRUE spelled out (T is a reassignable variable, not a keyword)
cat("How many accommodations have minimum nights more than mean + 2.5*standard deviation?",
    d1 %>% filter(minimum_nights >= mean(minimum_nights, na.rm = TRUE) + 2.5 * sd(minimum_nights, na.rm = TRUE)) %>% sdf_nrow())
## Warning: `lang_name()` is deprecated as of rlang 0.2.0.
## Please use `call_name()` instead.
## This warning is displayed once per session.
## Warning: `lang()` is deprecated as of rlang 0.2.0.
## Please use `call2()` instead.
## This warning is displayed once per session.
## How many accommodations have minimum nights more than mean + 2.5*standard deviation? 427
There are 427 datapoints that are more than 2.5 SDs away from mean, it’s safe to say that these are outliers, we’ll remove them later for modelling purposes.
#d1 = d1 %>%
# filter(minimum_nights <= mean(minimum_nights, na.rm = T)+2.5*sd(minimum_nights, na.rm = T))
#replot histogram
#plot_histogram("minimum_nights")
#plot_box("minimum_nights")
There are still some possible outliers but the spread is significantly lower. Regression models should be able to handle extrapolating these more extreme values.
# Spark-side summary statistics for number_of_reviews
d1 %>% sdf_describe("number_of_reviews")
## # Source: spark<?> [?? x 2]
## summary number_of_reviews
## <chr> <chr>
## 1 count 46934
## 2 mean 23.18696467379725
## 3 stddev 44.71422440863874
## 4 min 0.0
## 5 max 629.0
# look at the histogram and boxplot built earlier
histograms$number_of_reviews
boxplots$number_of_reviews
We can see some extreme values again.
# Count listings whose review count is more than 2.5 SDs above the mean
# (na.rm = TRUE spelled out instead of the reassignable shorthand T)
d1 %>%
  filter(number_of_reviews >= mean(number_of_reviews, na.rm = TRUE) + 2.5 * sd(number_of_reviews, na.rm = TRUE)) %>%
  sdf_nrow()
## [1] 1688
# Count listings with no reviews at all
d1 %>%
  filter(number_of_reviews == 0) %>%
  sdf_nrow()
## [1] 9678
# Spark-side summary statistics for reviews_per_month
d1 %>% sdf_describe("reviews_per_month")
## # Source: spark<?> [?? x 2]
## summary reviews_per_month
## <chr> <chr>
## 1 count 46934
## 2 mean 1.073917629010965
## 3 stddev 1.585515640503751
## 4 min 0.0
## 5 max 58.5
# histogram and boxplot for the same column
histograms$reviews_per_month
boxplots$reviews_per_month
Most listings receive approx 1 review/month with a few extremes.
# Spark-side summary statistics for listing_host (the renamed
# calculated_host_listings_count). NOTE: each host's count repeats on every
# one of their listings, so these statistics are weighted by listing count.
d1 %>% sdf_describe("listing_host")
## # Source: spark<?> [?? x 2]
## summary listing_host
## <chr> <chr>
## 1 count 46934
## 2 mean 7.341202539736652
## 3 stddev 33.595713063100774
## 4 min 1.0
## 5 max 327.0
histograms$listing_host
This feature is a bit misleading since there are duplicates (listings from the same host). Let’s recompute the statistics by looking at unique values.
# One row per host: de-duplicate (ID, listing_host) so that hosts with many
# listings are counted once, then sort by listing count (descending).
listings <- d1 %>% select(ID, listing_host) %>% distinct() %>% arrange(desc(listing_host))
listings %>% sdf_describe("listing_host")
## # Source: spark<?> [?? x 2]
## summary listing_host
## <chr> <chr>
## 1 count 36081
## 2 mean 1.3045092985227682
## 3 stddev 2.8070101327169055
## 4 min 1.0
## 5 max 327.0
# replot histogram and boxplot on the de-duplicated data
# (outer parentheses assign AND print the plot)
(hist_listings <- plot_histogram("listing_host", listings))
(box_listings <- plot_box("listing_host", listings))
# update grids of plots with the corrected panels
histograms[["listing_host"]] <- hist_listings
boxplots[["listing_host"]] <- box_listings
# save to png again
box_grid <- do.call("arrangeGrob", c(boxplots, ncol = 2))
ggsave(paste0(VISUALIZATIONS, "/boxplot_grid.png"), box_grid)
## Saving 7 x 5 in image
hist_grid <- do.call("arrangeGrob", c(histograms, ncol = 2))
ggsave(paste0(VISUALIZATIONS, "/histogram_grid.png"), hist_grid, width = 7, height = 7)
Most hosts have only 1 listing on average. However, there are clearly some who have sort of a network of listings. Let’s look at the top 10.
# collect listings to local R (small: one row per host)
listing_data <- sdf_collect(listings)
listing_data$ID <- as.factor(listing_data$ID)
# bar chart of the 10 hosts with the most listings
# (listing_data is already sorted descending, so rows 1:10 are the top 10)
(top_hosts <- ggplot(listing_data[1:10, ], aes(x = reorder(ID, -listing_host), y = listing_host, fill = ID)) +
  geom_bar(stat = "identity") +
  theme_few() +
  scale_fill_tableau() +
  guides(fill = FALSE) +
  labs(x = "",
       y = "Listings") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)))
ggsave(paste0(VISUALIZATIONS, "/top_hosts_listings.png"), top_hosts)
## Saving 7 x 5 in image
# IDs of hosts with 10 or more listings, kept for the map later
# (original comment said "more than 10" but the filter is >= 10)
topID <- listing_data %>% filter(listing_host >= 10) %>% select(ID)
topID <- as.numeric(as.character(topID$ID))
# Spark-side summary statistics for price (USD per night)
d1 %>% sdf_describe("price")
## # Source: spark<?> [?? x 2]
## summary price
## <chr> <chr>
## 1 count 46934
## 2 mean 154.30706950185368
## 3 stddev 240.57117624168677
## 4 min 0.0
## 5 max 10000.0
histograms$price
boxplots$price
There are some ridiculously expensive listings. On the other hand, there’s at least one listing which is offered for free. To explore the distribution of more common prices, we’ll remove these extremes for now.
# How many listings have a price of exactly zero?
cat("There are", d1 %>% filter(price == 0) %>% sdf_nrow(), "listings offered for free.")
## There are 10 listings offered for free.
# Let's look at their descriptions
d1 %>% filter(price == 0) %>% select(desc) # these mostly look like invalid data
## # Source: spark<?> [?? x 1]
## desc
## <chr>
## 1 Huge Brooklyn Brownstone Living, Close to it all.
## 2 MARTIAL LOFT 3: REDEMPTION (upstairs, 2nd room)
## 3 Sunny, Quiet Room in Greenpoint
## 4 Modern apartment in the heart of Williamsburg
## 5 Spacious comfortable master bedroom with nice view
## 6 Contemporary bedroom in brownstone with nice view
## 7 Cozy yet spacious private brownstone bedroom
## 8 the best you can find
## 9 Coliving in Brooklyn! Modern design / Shared room
## 10 Best Coliving space ever! Shared room.
# remove those free places immediately
d1 <- d1 %>% filter(price > 0)
# Let's look at descriptions of the most expensive listings
d1 %>% filter(price > 5000) %>% select(price, desc) %>% arrange(desc(price))
## # Source: spark<?> [?? x 2]
## # Ordered by: desc(price)
## price desc
## <dbl> <chr>
## 1 10000 Luxury 1 bedroom apt. -stunning Manhattan views
## 2 10000 Furnished room in Astoria apartment
## 3 10000 1-BR Lincoln Center
## 4 9999 2br - The Heart of NYC: Manhattans Lower East Side
## 5 9999 Spanish Harlem Apt
## 6 9999 Quiet, Clean, Lit @ LES & Chinatown
## 7 8500 Beautiful/Spacious 1 bed luxury flat-TriBeCa/Soho
## 8 8000 Film Location
## 9 7703 East 72nd Townhouse by (Hidden by Airbnb)
## 10 7500 Gem of east Flatbush
## # … with more rows
# truncate at $600 to see the distribution of typical prices
prices <- d1 %>% filter(price < 600)
plot_histogram("price", prices)
Summarise the categorical columns - count values per category
# Count listings per borough (raw column: neighbourhood_group).
# NOTE(review): the misspelled "neigbourhood_*" object names are kept on
# purpose — the plotting code further down refers to them.
(neigbourhood_gr_count <- d1 %>%
  group_by(neighbourhood_group) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  rename(borough = neighbourhood_group) %>%
  sdf_collect() %>%
  as.data.frame())
## borough count
## 1 Manhattan 21543
## 2 Brooklyn 19843
## 3 Queens 5190
## 4 Bronx 348
# Count listings per neighbourhood
neigbourhood_count <- d1 %>%
  group_by(neighbourhood) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  sdf_collect() %>%
  as.data.frame()
# top-10 neighbourhoods by listing count
neigbourhood_count[1:10, ]
## neighbourhood count
## 1 Williamsburg 3912
## 2 Bedford-Stuyvesant 3699
## 3 Harlem 2642
## 4 Bushwick 2457
## 5 Upper West Side 1968
## 6 Hell's Kitchen 1951
## 7 East Village 1849
## 8 Upper East Side 1791
## 9 Crown Heights 1559
## 10 Midtown 1541
# Count listings per room type
(room_count <- d1 %>%
  group_by(room_type) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  sdf_collect() %>%
  as.data.frame())
## room_type count
## 1 Entire home/apt 24572
## 2 Private room 21278
## 3 Shared room 1074
Plot the counts
# Bar chart: listings per borough
# (TRUE/FALSE spelled out instead of the reassignable shorthand F)
categ1 <- ggplot(neigbourhood_gr_count, aes(x = reorder(borough, -count), y = count, fill = borough)) +
  geom_bar(stat = "identity") +
  theme_few() +
  scale_fill_tableau() +
  labs(x = "") +
  guides(fill = FALSE)
# Bar chart: top-10 neighbourhoods
categ2 <- ggplot(neigbourhood_count[1:10, ], aes(x = reorder(neighbourhood, -count), y = count, fill = neighbourhood)) +
  geom_bar(stat = "identity") +
  theme_few() +
  scale_fill_tableau() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "") +
  guides(fill = FALSE)
# Bar chart: listings per room type
categ3 <- ggplot(room_count, aes(x = reorder(room_type, -count), y = count, fill = room_type)) +
  geom_bar(stat = "identity") +
  theme_few() +
  scale_fill_tableau() +
  labs(x = "") +
  guides(fill = FALSE)
# patchwork layout: borough + room type on top, neighbourhoods below
(categorical_counts <- (categ1 + categ3) / categ2)
ggsave(paste0(VISUALIZATIONS, "/categorical_counts.png"), categorical_counts)
## Saving 7 x 5 in image
Because we’re mainly interested in prices, we’ll now explore the relationship between price and the other variables.
# Mean price and spread per borough (aggregation runs Spark-side; SQL
# always drops NULLs, hence the one-time warning below)
price_bor <- d1 %>%
  group_by(neighbourhood_group) %>%
  summarise(mean = mean(price),
            sd = sd(price)) %>%
  sdf_collect()
## Warning: Missing values are always removed in SQL.
## Use `mean(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
# Bar chart with +/- 0.5 SD error bars
(price_bor_bar <- ggplot(price_bor, aes(x = reorder(neighbourhood_group, -mean), y = mean, fill = neighbourhood_group)) +
  geom_col(color = "black", size = 0.3) +
  geom_errorbar(aes(ymin = mean - 0.5 * sd, ymax = mean + 0.5 * sd)) +
  theme_few() +
  scale_fill_tableau() +
  guides(fill = FALSE) +
  labs(x = "",
       y = "price") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1)))
ggsave(paste0(VISUALIZATIONS, "/price_borough.png"), price_bor_bar)
## Saving 7 x 5 in image
# Mean price and spread per neighbourhood, sorted descending
price_nei <- d1 %>%
  group_by(neighbourhood) %>%
  summarise(mean = mean(price),
            sd = sd(price)) %>%
  arrange(desc(mean)) %>%
  sdf_collect()
# BUG FIX: the axis-text theme() was originally added OUTSIDE the assignment
# parentheses, so it affected only the on-screen print — the price_nei_point
# object saved by ggsave() below had unrotated, full-size labels. The theme
# layer is now part of the assigned plot.
(price_nei_point <- ggplot(price_nei, aes(x = reorder(neighbourhood, -mean), y = mean)) +
  geom_point() +
  theme_few() +
  labs(x = "",
       y = "price") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 4)))
# Bar chart of the 10 most expensive neighbourhoods with +/- 0.5 SD bars
(price_nei_bar <- ggplot(price_nei[1:10, ], aes(x = reorder(neighbourhood, -mean), y = mean, fill = neighbourhood)) +
  geom_col(color = "black", size = 0.3) +
  geom_errorbar(aes(ymin = mean - 0.5 * sd, ymax = mean + 0.5 * sd), width = 0.5) +
  theme_few() +
  scale_fill_tableau() +
  guides(fill = FALSE) +
  labs(x = "",
       y = "price") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)))
ggsave(paste0(VISUALIZATIONS, "/price_neighbourhoood_all.png"), price_nei_point)
## Saving 7 x 5 in image
ggsave(paste0(VISUALIZATIONS, "/price_neighbourhoood_top10.png"), price_nei_bar)
## Saving 7 x 5 in image
# Mean price and spread per room type, sorted descending
(price_type <- d1 %>%
  group_by(room_type) %>%
  summarise(mean = mean(price),
            sd = sd(price)) %>%
  arrange(desc(mean)) %>%
  sdf_collect())
## # A tibble: 3 x 3
## room_type mean sd
## <chr> <dbl> <dbl>
## 1 Entire home/apt 213. 285.
## 2 Private room 90.3 157.
## 3 Shared room 71.2 103.
# Bar chart with +/- 0.5 SD error bars (FALSE spelled out instead of F)
(price_type_bar <- ggplot(price_type, aes(x = reorder(room_type, -mean), y = mean, fill = room_type)) +
  geom_col(color = "black", size = 0.3) +
  geom_errorbar(aes(ymin = mean - 0.5 * sd, ymax = mean + 0.5 * sd), width = 0.5) +
  theme_few() +
  scale_fill_tableau() +
  guides(fill = FALSE) +
  labs(x = "",
       y = "price") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1)))
Let’s look whether there is interaction between borough and room type that explains the price variations.
# Mean price per (borough, room type) combination. The two fields are glued
# with "-" Spark-side, aggregated, then split back locally.
# NOTE(review): safe only because neither borough names nor room types
# contain a hyphen — confirm if the source data ever changes.
borough_type <- d1 %>%
  mutate(borough_type = paste(neighbourhood_group, room_type, sep = "-")) %>%
  group_by(borough_type) %>%
  summarise(mean = mean(price),
            sd = sd(price)) %>%
  sdf_collect() %>%
  tidyr::separate(borough_type, c("borough", "room_type"), sep = "-")
# Grouped bar chart: room types side by side within each borough
(borough_type_bar <- ggplot(borough_type, aes(fill = room_type, y = mean, x = borough)) +
  geom_col(position = "dodge", color = "black", size = 0.3) +
  geom_errorbar(aes(ymin = mean - 0.5 * sd, ymax = mean + 0.5 * sd), position = position_dodge(0.9), width = 0.5) +
  theme_few() +
  scale_fill_tableau(name = "Room type") +
  labs(x = "",
       y = "price"))
# Combined layout (patchwork) and export
(bor_type_layout <- (price_bor_bar + price_type_bar) / borough_type_bar)
ggsave(paste0(VISUALIZATIONS, "/borough_type.png"), bor_type_layout, width = 7, height = 6)
First, we create a background map of New York City + some detailed maps of boroughs
# SECURITY FIX: the Google Maps API key was hard-coded in source. A key
# committed to a repository must be treated as leaked — revoke it, and load
# a fresh one from the environment (set GOOGLE_MAPS_API_KEY before running).
register_google(key = Sys.getenv("GOOGLE_MAPS_API_KEY"))
# New York: centre the basemap on the mean coordinates of all listings
lat <- d1 %>% summarise(lat = mean(latitude)) %>% sdf_collect() %>% as.numeric()
lon <- d1 %>% summarise(lon = mean(longitude)) %>% sdf_collect() %>% as.numeric()
nyc <- get_googlemap(center = c(lon = lon, lat = lat),
                     zoom = 11, scale = 2,
                     maptype = "roadmap",
                     color = "color")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=40.728528,-73.953523&zoom=11&size=640x640&scale=2&maptype=roadmap&key=xxx-CbMnCGj7xa1s
# Listings under $500 (the extreme prices washed out the colour scale)
price_data <- d1 %>%
  select(latitude, longitude, price, neighbourhood_group) %>%
  filter(price < 500)
(price_map <- ggmap(nyc) +
  geom_point(data = price_data, aes(x = longitude, y = latitude, color = price), alpha = 0.8) +
  theme_map() +
  scale_color_gradient2_tableau())
## Warning: Removed 50 rows containing missing values (geom_point).
ggsave(paste0(VISUALIZATIONS, "/price_map.png"), price_map)
## Saving 7 x 5 in image
## Warning: Removed 50 rows containing missing values (geom_point).
# Listings belonging to the top hosts (>= 10 listings), collected locally
multi_hosts <- d1 %>% filter(ID %in% topID) %>% sdf_collect()
# centre the basemap on the mean coordinates of these listings
lat <- multi_hosts %>% summarise(lat = mean(latitude)) %>% as.numeric()
lon <- multi_hosts %>% summarise(lon = mean(longitude)) %>% as.numeric()
nyc <- get_googlemap(center = c(lon = lon, lat = lat),
                     zoom = 12, scale = 2,
                     maptype = "roadmap",
                     color = "color")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=40.739068,-73.968391&zoom=12&size=640x640&scale=2&maptype=roadmap&key=xxx-CbMnCGj7xa1s
# draw each host's listings in one colour, connected by a line to show
# their "network" (FALSE spelled out instead of F)
ggmap(nyc) +
  geom_line(data = multi_hosts, aes(x = longitude, y = latitude, color = factor(ID))) +
  geom_point(data = multi_hosts, aes(x = longitude, y = latitude, color = factor(ID)), size = 0.3) +
  guides(color = FALSE) +
  theme_void() +
  scale_color_manual(values = rainbow(length(unique(multi_hosts$ID))))
## Warning: Removed 119 rows containing missing values (geom_path).
## Warning: Removed 134 rows containing missing values (geom_point).
# One colour per host. Compute the distinct-host count instead of the
# hard-coded 36079, which silently breaks scale_color_manual() if the
# number of hosts changes upstream (e.g. after the price > 0 filter).
n_hosts <- d1 %>% distinct(ID) %>% sdf_nrow()
cols <- rep("#002a75", n_hosts)
(network <- ggplot(d1, aes(x = longitude, y = latitude, color = factor(ID))) +
  geom_line(aes(alpha = price)) +
  geom_point(aes(alpha = price), size = 0.01) +
  guides(color = FALSE, alpha = FALSE) +
  theme_void() +
  scale_color_manual(values = cols) +
  theme(plot.background = element_rect(fill = "black", color = "black")))
# export at a golden-ratio aspect
golden_ratio <- (1 + sqrt(5)) / 2
n <- 7
ggsave(paste0(VISUALIZATIONS, "/network_full.png"), network, dpi = 600, height = n, width = n * golden_ratio)